Import Packages

# if (!require("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")

#  BiocManager::install("oligo")
#  BiocManager::install("limma")
#  BiocManager::install("biomaRt")
#package for reading CEL files, will add more packages for further steps

Import Raw Data

csv_file <- read.table(file = r"(C:\Users\linde\rmarkdown\affy_processing\data_runsheets\GLDS_6_runsheet.csv)", 
                       sep=",",
                       header = TRUE,
                       check.names = FALSE
                       )
array_data_files <- csv_file["Path to Raw Data File"] #column name of paths

raw_data <- oligo::read.celfiles(array_data_files) #ExpressionFeatureSet Class
## Platform design info loaded.
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep1_GSM144733_raw1.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep2_GSM144734_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep3_GSM144735_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep1_GSM144736_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep2_GSM144737_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep3_GSM144738_raw.CEL.gz

Verify Data

print("Verify Data step")
## [1] "Verify Data step"

Quality Analysis

QA: Visualization

density_plot <- hist(raw_data, transfo=log2, which=c("pm", "mm", "bg", "both", "all"),
                    nsample=10000, target = "core", main = "Density Plot")

## TODO: Add sample legend   
#legend(15, 0.35, legend=c(colnames(raw_table)), lty=1:2, cex = 1)
# legend() is not compatable

transfo : used to scale the data

which : defines specific probe types

nsample : sample size used to produce plot

target : specifies group of meta-probeset

main : main title

pseudoimage <- image(raw_data, transfo=log2)

MA_plot <- MAplot(raw_data)
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'

## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'

### QA: Statistics

# BiocManager::install("AnnotationDbi")
# BiocManager::install("Biobase")

IQRray

# The IQRray statistic is obtained by ranking all the probe intensities from
# a given array and by computing the average rank for each probe set. The
# interquartile range (IQR) of the probe sets average ranks serves then as
# quality score. 
#Example of a low score: 122783.6
#   high score: 226124.6
library(methods)
library(AnnotationDbi)
library(Biobase)

    #data - Affybatch object obtained after reading in celfiles into R with function ReadAffy from package affy
    
    #obtaining intensity values for perfect match (pm) probes
pm_data<-pm(raw_data)
    
    #ranking probe intensities for every array 
rank_data<-apply(pm_data,2,rank)
    
    #obtaining names of probeset for every probe
probeNames<-probeNames(raw_data)
    
    #function computing IQR of mean probe ranks in probesets 
get_IQR<-function(rank_data,probeNames){round(IQR(sapply(split(rank_data,probeNames),mean)),digits=2)}
    
    #computing arIQR score
IQRray_score<-apply(rank_data,2,get_IQR,probeNames=probeNames)  

Normalize Data

norm <- normalize(raw_data)
## Normalizing... OK
raw_table <- exprs(raw_data)
# RMA is most commonly used method for preprocessing normalization of Affymetrix microarray data. There
# are three steps in the RMA algorithm: Convolution Background Correction, Quantile Normalization, and
# Summarization using Median Polish.
normRMA <- rma(raw_data)
## Background correcting
## Normalizing
## Calculating Expression
norm_probeset_table <- exprs(normRMA)  
#Human Exon array probeset to gene-level expression (biostars.org, https://www.biostars.org/p/271379/)
#Probe Level normalization
backgroundCorrection <- preprocessCore::rma.background.correct(raw_table)
probelevel_table <- preprocessCore::normalize.quantiles(backgroundCorrection, keep.names=TRUE)

QA Visualization for Normalized Data

MAplot_norm <- MAplot(normRMA)

densityPlot_norm <- hist(normRMA)

## PCA plot

pca <- prcomp(t(norm_probeset_table))
plot(pca$x[,1:2])

#how do you make a legend?
#legend('topright', col=1:6, legend = paste("Samples", 1:6), pch=20, bty='n', cex=.75)

DGE Analysis

## Getting Factor Values
fv <- data.frame(csv_file["Factor Value"])
all_factor_values <- fv[,1]
factor_values <- unique(all_factor_values)
all.factor.values <- make.names(all_factor_values)
factor.values <- make.names(factor_values)
## Creating Design
ph = raw_data@phenoData
ph@data[ ,2] = c(all.factor.values)
colnames(ph@data)[2]="source"
groups = ph@data$source
f = factor(groups,levels = factor.values)
design = model.matrix(~ 0 + f) 
colnames(design) <- factor.values
#Run for Data Sets with 2 Factor Values
fit = limma::lmFit(norm_probeset_table,design)
#Run for Data Sets with 3 Factor Values
fit = limma::lmFit(normRMA,design)
## Creating Constrast Matrix
fit.groups <- colnames(fit$design) [which(fit$assign == 1)]
fit.index <- which(levels(levels) %in% fit.groups)
fit.group.names <- gsub(" ", "_", sub(",", "_", unique(csv_file$'Factor Value')))
combos <- combn(fit.groups, 2)
combos.names <- combn(fit.group.names, 2)
contrasts <- c(paste(combos[1,], combos[2,], sep = "-", paste(combos[2,], combos[1,], sep="-")))
contrast.names <- c(paste(combos.names[1,], combos.names[2,], sep = "v"), paste(combos.names[2,], 
                    combos.names[1,], sep = "v"))
cont.matrix <- limma::makeContrasts(contrasts = contrasts, levels=design)
contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
contrast.fit.eb <- limma::eBayes(contrast.fit)
log_fold_change <- contrast.fit.eb$coefficients[1:10,]
p_value <- contrast.fit.eb$p.value[1:10,]
adjusted_pvalue <- p.adjust(p_value,method="BH")
#results <- limma::decideTests(contrast.fit.eb, method = "separate", adjust.method = "BH", p.value = 0.05, lfc = 0.5)
#summary(limma::decideTests(contrast.fit.eb[,-1]))
#Error in h(simpleError(msg, call)) : 
#  error in evaluating the argument 'object' in selecting a method for function 'summary': 
#  subscript out of bounds

Gene Mapping

library(biomaRt)
library(dplyr)
listEnsembl()
##         biomart                version
## 1         genes      Ensembl Genes 107
## 2 mouse_strains      Mouse strains 107
## 3          snps  Ensembl Variation 107
## 4    regulation Ensembl Regulation 107
ensembl <- useEnsembl(biomart = "genes")
searchDatasets(mart = ensembl, pattern = "Mouse") 
##                    dataset                  description  version
## 107  mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
## 108 mmusculus_gene_ensembl         Mouse genes (GRCm39)   GRCm39
ensembl <- useDataset(dataset = "mmusculus_gene_ensembl", mart = ensembl)
searchAttributes(mart = ensembl, pattern = "affy_")
##                      name                 description         page
## 95           affy_mg_u74a          AFFY MG U74A probe feature_page
## 96         affy_mg_u74av2        AFFY MG U74Av2 probe feature_page
## 97           affy_mg_u74b          AFFY MG U74B probe feature_page
## 98         affy_mg_u74bv2        AFFY MG U74Bv2 probe feature_page
## 99           affy_mg_u74c          AFFY MG U74C probe feature_page
## 100        affy_mg_u74cv2        AFFY MG U74Cv2 probe feature_page
## 101          affy_moe430a          AFFY MOE430A probe feature_page
## 102          affy_moe430b          AFFY MOE430B probe feature_page
## 103   affy_moex_1_0_st_v1   AFFY MoEx 1 0 st v1 probe feature_page
## 104 affy_mogene_1_0_st_v1 AFFY MoGene 1 0 st v1 probe feature_page
## 105 affy_mogene_2_1_st_v1 AFFY MoGene 2 1 st v1 probe feature_page
## 106      affy_mouse430a_2      AFFY Mouse430A 2 probe feature_page
## 107       affy_mouse430_2       AFFY Mouse430 2 probe feature_page
## 108        affy_mu11ksuba        AFFY Mu11KsubA probe feature_page
## 109        affy_mu11ksubb        AFFY Mu11KsubB probe feature_page
my_affy_ids <- c(head(rownames(norm_probeset_table), 2000))
results <- getBM(
    attributes = c(
        "affy_mogene_1_0_st_v1",
        "ensembl_gene_id",
        "uniprot_gn_symbol",
        "go_id"
        ),  # Columns we want, these were found using searchAttributes
    filters = "affy_mogene_1_0_st_v1", #  Filters (one or more) that should be used in the query
    values = my_affy_ids,  # Query IDs
    mart = ensembl)

Verify Normalization

print("Verify Normalization step")
## [1] "Verify Normalization step"

DGE Analysis

print("DGE Analysis step")
## [1] "DGE Analysis step"

Gene Annotations

print("Gene Annotations step")
## [1] "Gene Annotations step"

Code I will not be using but do not want to delete

# #trying to make a contrast matrix
# comparisons <- c()
# for (i in range(length(factor_values)))
# {
#     comparisons <- append(comparisons, fv[i]'-'fv[i+1])
# }
# #attempt 2
# counter <- 2
# for (fv in factor_values)
# {
#     comparisons <- append(comparisons, fv'-'fv[counter])
#     counter <- counter + 1
# }

# #gene mapping with adf
# adf <- read.delim(file=r"(C:\Users\linde\OneDrive\Desktop\GLDS-6_metadata_A-AFFY-43.adf.txt)", header=FALSE, skip=74)
# probeset_names <- c(adf[1])
# ensembl_id_with_emptys <- adf[12]
# refseq <- c(adf[2])
# ensembl_id <- lapply(ensembl_id_with_emptys, function(x) x[x!=""])
# probeset_ensembl_refseq <- data.frame(probeset_names, ensembl_id_with_emptys, refseq)
# ens <- c(probeset_ensembl_refseq[,2])
# probeset <- c(probeset_ensembl_refseq[,1])
# rs <- c(probeset_ensembl_refseq[,3])

# prfromnorm <- rownames(norm_probeset_table)

# #probeset_and_ensembl <- data.frame(probeset_names, ensembl_id)
# c <- 1
# probeset_refseq_dict <- c()
# for (i in probeset)
# {
#     probeset_refseq_dict[rs[c]]<-i
#     c <- c + 1
# }

# rsdupsbool <- duplicated(rs)
# dupsindex<-c()
# counter <- 1
# for (bool in rsdups)
# {
#     if (bool=="TRUE")
#     {
#         dupsindex <- append(dupsindex, counter)
#     }
#     counter <- counter + 1
# }
# duprefseq <- c()
# for (index in dupsindex)
# {
#     duprefseq <- append(duprefseq, rs[index])
# }

# #rsdups <- duplicated(rs)
# uniquers <- unique(rs)
# probeset_refseq_dict <- c()
#     #probeset_with_uniquerefseq[uniquers[probeset_with_refseq[ps]]]<-ps